Nous allons nous baser sur les données mensuelles de vente d'une société de smartphones et gadgets:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os
import warnings
warnings.filterwarnings('ignore')
data = ("./Sales_Data/")
os.listdir(data)
['.DS_Store', 'all_data.csv', 'Sales_April_2019.csv', 'Sales_August_2019.csv', 'Sales_December_2019.csv', 'Sales_February_2019.csv', 'Sales_January_2019.csv', 'Sales_July_2019.csv', 'Sales_June_2019.csv', 'Sales_March_2019.csv', 'Sales_May_2019.csv', 'Sales_November_2019.csv', 'Sales_October_2019.csv', 'Sales_September_2019.csv']
# Collecter les noms des fichiers (Datasets)
files = [fichier for fichier in os.listdir(data) if fichier.endswith('.csv')]
files
['all_data.csv', 'Sales_April_2019.csv', 'Sales_August_2019.csv', 'Sales_December_2019.csv', 'Sales_February_2019.csv', 'Sales_January_2019.csv', 'Sales_July_2019.csv', 'Sales_June_2019.csv', 'Sales_March_2019.csv', 'Sales_May_2019.csv', 'Sales_November_2019.csv', 'Sales_October_2019.csv', 'Sales_September_2019.csv']
april = pd.read_csv("./Sales_Data/Sales_April_2019.csv")
april.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 18383 entries, 0 to 18382 Data columns (total 6 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Order ID 18324 non-null object 1 Product 18324 non-null object 2 Quantity Ordered 18324 non-null object 3 Price Each 18324 non-null object 4 Order Date 18324 non-null object 5 Purchase Address 18324 non-null object dtypes: object(6) memory usage: 861.8+ KB
janvier = pd.read_csv(data + 'Sales_January_2019.csv')
janvier.head()
Order ID | Product | Quantity Ordered | Price Each | Order Date | Purchase Address | |
---|---|---|---|---|---|---|
0 | 141234 | iPhone | 1 | 700 | 01/22/19 21:25 | 944 Walnut St, Boston, MA 02215 |
1 | 141235 | Lightning Charging Cable | 1 | 14.95 | 01/28/19 14:15 | 185 Maple St, Portland, OR 97035 |
2 | 141236 | Wired Headphones | 2 | 11.99 | 01/17/19 13:33 | 538 Adams St, San Francisco, CA 94016 |
3 | 141237 | 27in FHD Monitor | 1 | 149.99 | 01/05/19 20:33 | 738 10th St, Los Angeles, CA 90001 |
4 | 141238 | Wired Headphones | 1 | 11.99 | 01/25/19 11:59 | 387 10th St, Austin, TX 73301 |
fevrier = pd.read_csv(data + 'Sales_February_2019.csv')
fevrier.head()
Order ID | Product | Quantity Ordered | Price Each | Order Date | Purchase Address | |
---|---|---|---|---|---|---|
0 | 150502 | iPhone | 1 | 700 | 02/18/19 01:35 | 866 Spruce St, Portland, ME 04101 |
1 | 150503 | AA Batteries (4-pack) | 1 | 3.84 | 02/13/19 07:24 | 18 13th St, San Francisco, CA 94016 |
2 | 150504 | 27in 4K Gaming Monitor | 1 | 389.99 | 02/18/19 09:46 | 52 6th St, New York City, NY 10001 |
3 | 150505 | Lightning Charging Cable | 1 | 14.95 | 02/02/19 16:47 | 129 Cherry St, Atlanta, GA 30301 |
4 | 150506 | AA Batteries (4-pack) | 2 | 3.84 | 02/28/19 20:32 | 548 Lincoln St, Seattle, WA 98101 |
all_data = pd.DataFrame()
for file in files:
data_interm = pd.read_csv(data + file)
all_data = pd.concat([all_data, data_interm])
all_data.head()
Order ID | Product | Quantity Ordered | Price Each | Order Date | Purchase Address | |
---|---|---|---|---|---|---|
0 | 176558 | USB-C Charging Cable | 2 | 11.95 | 04/19/19 08:46 | 917 1st St, Dallas, TX 75001 |
1 | NaN | NaN | NaN | NaN | NaN | NaN |
2 | 176559 | Bose SoundSport Headphones | 1 | 99.99 | 04/07/19 22:30 | 682 Chestnut St, Boston, MA 02215 |
3 | 176560 | Google Phone | 1 | 600 | 04/12/19 14:38 | 669 Spruce St, Los Angeles, CA 90001 |
4 | 176560 | Wired Headphones | 1 | 11.99 | 04/12/19 14:38 | 669 Spruce St, Los Angeles, CA 90001 |
all_data.shape
(373700, 6)
all_data.to_csv(data+'/all_data.csv',index=False)
all_data.dtypes
Order ID object Product object Quantity Ordered object Price Each object Order Date object Purchase Address object dtype: object
all_data.isnull().sum()
Order ID 1090 Product 1090 Quantity Ordered 1090 Price Each 1090 Order Date 1090 Purchase Address 1090 dtype: int64
all_data = all_data.dropna(how='all')
all_data.shape
(372610, 6)
# def day(x):
# return x.split('/')[1]
# day('12/30/19 00:01')
def month(x):
return x.split('/')[0]
#month('12/30/19 00:01')
all_data['Month'] = all_data['Order Date'].apply(month)
all_data
Order ID | Product | Quantity Ordered | Price Each | Order Date | Purchase Address | Month | |
---|---|---|---|---|---|---|---|
0 | 176558 | USB-C Charging Cable | 2 | 11.95 | 04/19/19 08:46 | 917 1st St, Dallas, TX 75001 | 04 |
2 | 176559 | Bose SoundSport Headphones | 1 | 99.99 | 04/07/19 22:30 | 682 Chestnut St, Boston, MA 02215 | 04 |
3 | 176560 | Google Phone | 1 | 600 | 04/12/19 14:38 | 669 Spruce St, Los Angeles, CA 90001 | 04 |
4 | 176560 | Wired Headphones | 1 | 11.99 | 04/12/19 14:38 | 669 Spruce St, Los Angeles, CA 90001 | 04 |
5 | 176561 | Wired Headphones | 1 | 11.99 | 04/30/19 09:27 | 333 8th St, Los Angeles, CA 90001 | 04 |
... | ... | ... | ... | ... | ... | ... | ... |
11681 | 259353 | AAA Batteries (4-pack) | 3 | 2.99 | 09/17/19 20:56 | 840 Highland St, Los Angeles, CA 90001 | 09 |
11682 | 259354 | iPhone | 1 | 700 | 09/01/19 16:00 | 216 Dogwood St, San Francisco, CA 94016 | 09 |
11683 | 259355 | iPhone | 1 | 700 | 09/23/19 07:39 | 220 12th St, San Francisco, CA 94016 | 09 |
11684 | 259356 | 34in Ultrawide Monitor | 1 | 379.99 | 09/19/19 17:30 | 511 Forest St, San Francisco, CA 94016 | 09 |
11685 | 259357 | USB-C Charging Cable | 1 | 11.95 | 09/30/19 00:18 | 250 Meadow St, San Francisco, CA 94016 | 09 |
372610 rows × 7 columns
all_data['Month'].unique()
array(['04', '05', 'Order Date', '08', '09', '12', '01', '02', '03', '07', '06', '11', '10'], dtype=object)
'Order Date'
¶all_data = all_data[all_data['Month']!='Order Date']
all_data['Month'].unique()
array(['04', '05', '08', '09', '12', '01', '02', '03', '07', '06', '11', '10'], dtype=object)
all_data['Month'] = all_data['Month'].astype(int)
all_data['Price Each'] = all_data['Price Each'].astype(float)
all_data['Quantity Ordered'] = all_data['Quantity Ordered'].astype(float)
all_data.dtypes
Order ID object Product object Quantity Ordered float64 Price Each float64 Order Date object Purchase Address object Month int32 dtype: object
chifre_d'affaire
¶all_data["chifre_d'affaire"] = all_data["Quantity Ordered"]*all_data["Price Each"]
all_data
Order ID | Product | Quantity Ordered | Price Each | Order Date | Purchase Address | Month | chifre_d'affaire | |
---|---|---|---|---|---|---|---|---|
0 | 176558 | USB-C Charging Cable | 2.0 | 11.95 | 04/19/19 08:46 | 917 1st St, Dallas, TX 75001 | 4 | 23.90 |
2 | 176559 | Bose SoundSport Headphones | 1.0 | 99.99 | 04/07/19 22:30 | 682 Chestnut St, Boston, MA 02215 | 4 | 99.99 |
3 | 176560 | Google Phone | 1.0 | 600.00 | 04/12/19 14:38 | 669 Spruce St, Los Angeles, CA 90001 | 4 | 600.00 |
4 | 176560 | Wired Headphones | 1.0 | 11.99 | 04/12/19 14:38 | 669 Spruce St, Los Angeles, CA 90001 | 4 | 11.99 |
5 | 176561 | Wired Headphones | 1.0 | 11.99 | 04/30/19 09:27 | 333 8th St, Los Angeles, CA 90001 | 4 | 11.99 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
11681 | 259353 | AAA Batteries (4-pack) | 3.0 | 2.99 | 09/17/19 20:56 | 840 Highland St, Los Angeles, CA 90001 | 9 | 8.97 |
11682 | 259354 | iPhone | 1.0 | 700.00 | 09/01/19 16:00 | 216 Dogwood St, San Francisco, CA 94016 | 9 | 700.00 |
11683 | 259355 | iPhone | 1.0 | 700.00 | 09/23/19 07:39 | 220 12th St, San Francisco, CA 94016 | 9 | 700.00 |
11684 | 259356 | 34in Ultrawide Monitor | 1.0 | 379.99 | 09/19/19 17:30 | 511 Forest St, San Francisco, CA 94016 | 9 | 379.99 |
11685 | 259357 | USB-C Charging Cable | 1.0 | 11.95 | 09/30/19 00:18 | 250 Meadow St, San Francisco, CA 94016 | 9 | 11.95 |
371900 rows × 8 columns
all_data.groupby('Month')["chifre_d'affaire"].sum()
Month 1 3.644513e+06 2 4.404045e+06 3 5.614201e+06 4 6.781340e+06 5 6.305214e+06 6 5.155605e+06 7 5.295552e+06 8 4.488936e+06 9 4.195120e+06 10 7.473454e+06 11 6.399206e+06 12 9.226887e+06 Name: chifre_d'affaire, dtype: float64
from matplotlib.pyplot import figure
figure(figsize=(12, 8), dpi=80)
months = range(1,13)
plt.bar(months, all_data.groupby('Month')["chifre_d'affaire"].sum())
plt.xticks(months)
plt.ylabel("chifre_d'affaire en UDS")
plt.xlabel('Month number');
Décembre est le mois avec le plus de ventes et le CA est de 4557905.420001525
all_data['Purchase Address']
0 917 1st St, Dallas, TX 75001 2 682 Chestnut St, Boston, MA 02215 3 669 Spruce St, Los Angeles, CA 90001 4 669 Spruce St, Los Angeles, CA 90001 5 333 8th St, Los Angeles, CA 90001 ... 11681 840 Highland St, Los Angeles, CA 90001 11682 216 Dogwood St, San Francisco, CA 94016 11683 220 12th St, San Francisco, CA 94016 11684 511 Forest St, San Francisco, CA 94016 11685 250 Meadow St, San Francisco, CA 94016 Name: Purchase Address, Length: 371900, dtype: object
"917 1st St, Dallas, TX 75001".split(',')[1]
' Dallas'
def city(x):
return x.split(',')[1]
all_data['ville'] = all_data['Purchase Address'].apply(city)
all_data
Order ID | Product | Quantity Ordered | Price Each | Order Date | Purchase Address | Month | chifre_d'affaire | ville | |
---|---|---|---|---|---|---|---|---|---|
0 | 176558 | USB-C Charging Cable | 2.0 | 11.95 | 04/19/19 08:46 | 917 1st St, Dallas, TX 75001 | 4 | 23.90 | Dallas |
2 | 176559 | Bose SoundSport Headphones | 1.0 | 99.99 | 04/07/19 22:30 | 682 Chestnut St, Boston, MA 02215 | 4 | 99.99 | Boston |
3 | 176560 | Google Phone | 1.0 | 600.00 | 04/12/19 14:38 | 669 Spruce St, Los Angeles, CA 90001 | 4 | 600.00 | Los Angeles |
4 | 176560 | Wired Headphones | 1.0 | 11.99 | 04/12/19 14:38 | 669 Spruce St, Los Angeles, CA 90001 | 4 | 11.99 | Los Angeles |
5 | 176561 | Wired Headphones | 1.0 | 11.99 | 04/30/19 09:27 | 333 8th St, Los Angeles, CA 90001 | 4 | 11.99 | Los Angeles |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
11681 | 259353 | AAA Batteries (4-pack) | 3.0 | 2.99 | 09/17/19 20:56 | 840 Highland St, Los Angeles, CA 90001 | 9 | 8.97 | Los Angeles |
11682 | 259354 | iPhone | 1.0 | 700.00 | 09/01/19 16:00 | 216 Dogwood St, San Francisco, CA 94016 | 9 | 700.00 | San Francisco |
11683 | 259355 | iPhone | 1.0 | 700.00 | 09/23/19 07:39 | 220 12th St, San Francisco, CA 94016 | 9 | 700.00 | San Francisco |
11684 | 259356 | 34in Ultrawide Monitor | 1.0 | 379.99 | 09/19/19 17:30 | 511 Forest St, San Francisco, CA 94016 | 9 | 379.99 | San Francisco |
11685 | 259357 | USB-C Charging Cable | 1.0 | 11.95 | 09/30/19 00:18 | 250 Meadow St, San Francisco, CA 94016 | 9 | 11.95 | San Francisco |
371900 rows × 9 columns
all_data['ville'].unique()
array([' Dallas', ' Boston', ' Los Angeles', ' San Francisco', ' Seattle', ' Atlanta', ' New York City', ' Portland', ' Austin'], dtype=object)
ca_by_ville = all_data.groupby('ville')['ville'].count()
ca_by_ville
ville Atlanta 29762 Austin 19810 Boston 39868 Dallas 29640 Los Angeles 59210 New York City 49752 Portland 24930 San Francisco 89464 Seattle 29464 Name: ville, dtype: int64
ca_by_ville.plot(kind='bar', figsize=(12, 8))
plt.ylabel('Chiffre daffaire en Millions de dollars')
plt.title('Chiffre daffaire par Ville')
plt.show()
from matplotlib.pyplot import figure
figure(figsize=(12, 8), dpi=80)
plt.bar(all_data.groupby('ville')['ville'].count().index,all_data.groupby('ville')['ville'].count())
plt.xticks(rotation='vertical')
plt.ylabel("Received orders")
plt.xlabel('Nom de ville');
all_data
Order ID | Product | Quantity Ordered | Price Each | Order Date | Purchase Address | Month | chifre_d'affaire | ville | |
---|---|---|---|---|---|---|---|---|---|
0 | 176558 | USB-C Charging Cable | 2.0 | 11.95 | 04/19/19 08:46 | 917 1st St, Dallas, TX 75001 | 4 | 23.90 | Dallas |
2 | 176559 | Bose SoundSport Headphones | 1.0 | 99.99 | 04/07/19 22:30 | 682 Chestnut St, Boston, MA 02215 | 4 | 99.99 | Boston |
3 | 176560 | Google Phone | 1.0 | 600.00 | 04/12/19 14:38 | 669 Spruce St, Los Angeles, CA 90001 | 4 | 600.00 | Los Angeles |
4 | 176560 | Wired Headphones | 1.0 | 11.99 | 04/12/19 14:38 | 669 Spruce St, Los Angeles, CA 90001 | 4 | 11.99 | Los Angeles |
5 | 176561 | Wired Headphones | 1.0 | 11.99 | 04/30/19 09:27 | 333 8th St, Los Angeles, CA 90001 | 4 | 11.99 | Los Angeles |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
11681 | 259353 | AAA Batteries (4-pack) | 3.0 | 2.99 | 09/17/19 20:56 | 840 Highland St, Los Angeles, CA 90001 | 9 | 8.97 | Los Angeles |
11682 | 259354 | iPhone | 1.0 | 700.00 | 09/01/19 16:00 | 216 Dogwood St, San Francisco, CA 94016 | 9 | 700.00 | San Francisco |
11683 | 259355 | iPhone | 1.0 | 700.00 | 09/23/19 07:39 | 220 12th St, San Francisco, CA 94016 | 9 | 700.00 | San Francisco |
11684 | 259356 | 34in Ultrawide Monitor | 1.0 | 379.99 | 09/19/19 17:30 | 511 Forest St, San Francisco, CA 94016 | 9 | 379.99 | San Francisco |
11685 | 259357 | USB-C Charging Cable | 1.0 | 11.95 | 09/30/19 00:18 | 250 Meadow St, San Francisco, CA 94016 | 9 | 11.95 | San Francisco |
371900 rows × 9 columns
all_data['Hour']=pd.to_datetime(all_data['Order Date']).dt.hour
Order ID | Product | Quantity Ordered | Price Each | Order Date | Purchase Address | Month | chifre_d'affaire | ville | Hour | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 176558 | USB-C Charging Cable | 2.0 | 11.95 | 04/19/19 08:46 | 917 1st St, Dallas, TX 75001 | 4 | 23.90 | Dallas | 8 |
2 | 176559 | Bose SoundSport Headphones | 1.0 | 99.99 | 04/07/19 22:30 | 682 Chestnut St, Boston, MA 02215 | 4 | 99.99 | Boston | 22 |
3 | 176560 | Google Phone | 1.0 | 600.00 | 04/12/19 14:38 | 669 Spruce St, Los Angeles, CA 90001 | 4 | 600.00 | Los Angeles | 14 |
4 | 176560 | Wired Headphones | 1.0 | 11.99 | 04/12/19 14:38 | 669 Spruce St, Los Angeles, CA 90001 | 4 | 11.99 | Los Angeles | 14 |
5 | 176561 | Wired Headphones | 1.0 | 11.99 | 04/30/19 09:27 | 333 8th St, Los Angeles, CA 90001 | 4 | 11.99 | Los Angeles | 9 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
11681 | 259353 | AAA Batteries (4-pack) | 3.0 | 2.99 | 09/17/19 20:56 | 840 Highland St, Los Angeles, CA 90001 | 9 | 8.97 | Los Angeles | 20 |
11682 | 259354 | iPhone | 1.0 | 700.00 | 09/01/19 16:00 | 216 Dogwood St, San Francisco, CA 94016 | 9 | 700.00 | San Francisco | 16 |
11683 | 259355 | iPhone | 1.0 | 700.00 | 09/23/19 07:39 | 220 12th St, San Francisco, CA 94016 | 9 | 700.00 | San Francisco | 7 |
11684 | 259356 | 34in Ultrawide Monitor | 1.0 | 379.99 | 09/19/19 17:30 | 511 Forest St, San Francisco, CA 94016 | 9 | 379.99 | San Francisco | 17 |
11685 | 259357 | USB-C Charging Cable | 1.0 | 11.95 | 09/30/19 00:18 | 250 Meadow St, San Francisco, CA 94016 | 9 | 11.95 | San Francisco | 0 |
371900 rows × 10 columns
keys=[]
hours=[]
for key,hour in all_data.groupby('Hour'):
keys.append(key)
hours.append(len(hour))
hours
[7820, 4700, 2486, 1662, 1708, 2642, 4964, 8022, 12512, 17496, 21888, 24822, 25174, 24258, 21968, 20350, 20768, 21798, 24560, 25810, 24456, 21842, 17644, 12550]
from matplotlib.pyplot import figure
figure(figsize=(12, 8), dpi=80)
plt.grid()
plt.plot(keys,hours)
plt.xlabel('Heure de la journée')
plt.ylabel('Nombre des commandes');
Nous remarquos que l'heure où il y a plus des commandes c'est à environ 12h et 19h
all_data.groupby('Product')['Quantity Ordered'].sum().plot(kind='bar',figsize=(12,8));
all_data.groupby('Product')['Price Each'].mean()
Product 20in Monitor 109.99 27in 4K Gaming Monitor 389.99 27in FHD Monitor 149.99 34in Ultrawide Monitor 379.99 AA Batteries (4-pack) 3.84 AAA Batteries (4-pack) 2.99 Apple Airpods Headphones 150.00 Bose SoundSport Headphones 99.99 Flatscreen TV 300.00 Google Phone 600.00 LG Dryer 600.00 LG Washing Machine 600.00 Lightning Charging Cable 14.95 Macbook Pro Laptop 1700.00 ThinkPad Laptop 999.99 USB-C Charging Cable 11.95 Vareebadd Phone 400.00 Wired Headphones 11.99 iPhone 700.00 Name: Price Each, dtype: float64
# Vizuation de quantités vendus et le prix
products=all_data.groupby('Product')['Quantity Ordered'].sum().index
quantity=all_data.groupby('Product')['Quantity Ordered'].sum()
prices=all_data.groupby('Product')['Price Each'].mean()
plt.figure(figsize=(40,24))
fig,ax1=plt.subplots()
ax2=ax1.twinx()
ax1.bar(products,quantity,color='g')
ax2.plot(products,prices,'b-')
ax1.set_xticklabels(products,rotation='vertical',size=8)
[Text(0, 0, '20in Monitor'), Text(1, 0, '27in 4K Gaming Monitor'), Text(2, 0, '27in FHD Monitor'), Text(3, 0, '34in Ultrawide Monitor'), Text(4, 0, 'AA Batteries (4-pack)'), Text(5, 0, 'AAA Batteries (4-pack)'), Text(6, 0, 'Apple Airpods Headphones'), Text(7, 0, 'Bose SoundSport Headphones'), Text(8, 0, 'Flatscreen TV'), Text(9, 0, 'Google Phone'), Text(10, 0, 'LG Dryer'), Text(11, 0, 'LG Washing Machine'), Text(12, 0, 'Lightning Charging Cable'), Text(13, 0, 'Macbook Pro Laptop'), Text(14, 0, 'ThinkPad Laptop'), Text(15, 0, 'USB-C Charging Cable'), Text(16, 0, 'Vareebadd Phone'), Text(17, 0, 'Wired Headphones'), Text(18, 0, 'iPhone')]
En Bleu c'est le prix de chaque article vendu et en Vert les nombres des produits vendus. Les choses qui sont cher, ne sont pas trop vendus, mais les choses qui sont moins cher, sont les plus vendus.
df=all_data[all_data['Order ID'].duplicated(keep=False)]
df['Grouped']=df.groupby('Order ID')['Product'].transform(lambda x: ','.join(x))
df
Order ID | Product | Quantity Ordered | Price Each | Order Date | Purchase Address | Month | chifre_d'affaire | ville | Hour | Grouped | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 176558 | USB-C Charging Cable | 2.0 | 11.95 | 04/19/19 08:46 | 917 1st St, Dallas, TX 75001 | 4 | 23.90 | Dallas | 8 | USB-C Charging Cable,USB-C Charging Cable |
2 | 176559 | Bose SoundSport Headphones | 1.0 | 99.99 | 04/07/19 22:30 | 682 Chestnut St, Boston, MA 02215 | 4 | 99.99 | Boston | 22 | Bose SoundSport Headphones,Bose SoundSport Hea... |
3 | 176560 | Google Phone | 1.0 | 600.00 | 04/12/19 14:38 | 669 Spruce St, Los Angeles, CA 90001 | 4 | 600.00 | Los Angeles | 14 | Google Phone,Wired Headphones,Google Phone,Wir... |
4 | 176560 | Wired Headphones | 1.0 | 11.99 | 04/12/19 14:38 | 669 Spruce St, Los Angeles, CA 90001 | 4 | 11.99 | Los Angeles | 14 | Google Phone,Wired Headphones,Google Phone,Wir... |
5 | 176561 | Wired Headphones | 1.0 | 11.99 | 04/30/19 09:27 | 333 8th St, Los Angeles, CA 90001 | 4 | 11.99 | Los Angeles | 9 | Wired Headphones,Wired Headphones |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
11681 | 259353 | AAA Batteries (4-pack) | 3.0 | 2.99 | 09/17/19 20:56 | 840 Highland St, Los Angeles, CA 90001 | 9 | 8.97 | Los Angeles | 20 | AAA Batteries (4-pack),AAA Batteries (4-pack) |
11682 | 259354 | iPhone | 1.0 | 700.00 | 09/01/19 16:00 | 216 Dogwood St, San Francisco, CA 94016 | 9 | 700.00 | San Francisco | 16 | iPhone,iPhone |
11683 | 259355 | iPhone | 1.0 | 700.00 | 09/23/19 07:39 | 220 12th St, San Francisco, CA 94016 | 9 | 700.00 | San Francisco | 7 | iPhone,iPhone |
11684 | 259356 | 34in Ultrawide Monitor | 1.0 | 379.99 | 09/19/19 17:30 | 511 Forest St, San Francisco, CA 94016 | 9 | 379.99 | San Francisco | 17 | 34in Ultrawide Monitor,34in Ultrawide Monitor |
11685 | 259357 | USB-C Charging Cable | 1.0 | 11.95 | 09/30/19 00:18 | 250 Meadow St, San Francisco, CA 94016 | 9 | 11.95 | San Francisco | 0 | USB-C Charging Cable,USB-C Charging Cable |
371900 rows × 11 columns
df['Grouped']
0 USB-C Charging Cable,USB-C Charging Cable 2 Bose SoundSport Headphones,Bose SoundSport Hea... 3 Google Phone,Wired Headphones,Google Phone,Wir... 4 Google Phone,Wired Headphones,Google Phone,Wir... 5 Wired Headphones,Wired Headphones ... 11681 AAA Batteries (4-pack),AAA Batteries (4-pack) 11682 iPhone,iPhone 11683 iPhone,iPhone 11684 34in Ultrawide Monitor,34in Ultrawide Monitor 11685 USB-C Charging Cable,USB-C Charging Cable Name: Grouped, Length: 371900, dtype: object
# Dropping duplicates (Order ID)
df2=df.drop_duplicates(subset=['Order ID'])
df2
Order ID | Product | Quantity Ordered | Price Each | Order Date | Purchase Address | Month | chifre_d'affaire | ville | Hour | Grouped | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 176558 | USB-C Charging Cable | 2.0 | 11.95 | 04/19/19 08:46 | 917 1st St, Dallas, TX 75001 | 4 | 23.90 | Dallas | 8 | USB-C Charging Cable,USB-C Charging Cable |
2 | 176559 | Bose SoundSport Headphones | 1.0 | 99.99 | 04/07/19 22:30 | 682 Chestnut St, Boston, MA 02215 | 4 | 99.99 | Boston | 22 | Bose SoundSport Headphones,Bose SoundSport Hea... |
3 | 176560 | Google Phone | 1.0 | 600.00 | 04/12/19 14:38 | 669 Spruce St, Los Angeles, CA 90001 | 4 | 600.00 | Los Angeles | 14 | Google Phone,Wired Headphones,Google Phone,Wir... |
5 | 176561 | Wired Headphones | 1.0 | 11.99 | 04/30/19 09:27 | 333 8th St, Los Angeles, CA 90001 | 4 | 11.99 | Los Angeles | 9 | Wired Headphones,Wired Headphones |
6 | 176562 | USB-C Charging Cable | 1.0 | 11.95 | 04/29/19 13:03 | 381 Wilson St, San Francisco, CA 94016 | 4 | 11.95 | San Francisco | 13 | USB-C Charging Cable,USB-C Charging Cable |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
186845 | 259353 | AAA Batteries (4-pack) | 3.0 | 2.99 | 09/17/19 20:56 | 840 Highland St, Los Angeles, CA 90001 | 9 | 8.97 | Los Angeles | 20 | AAA Batteries (4-pack),AAA Batteries (4-pack) |
186846 | 259354 | iPhone | 1.0 | 700.00 | 09/01/19 16:00 | 216 Dogwood St, San Francisco, CA 94016 | 9 | 700.00 | San Francisco | 16 | iPhone,iPhone |
186847 | 259355 | iPhone | 1.0 | 700.00 | 09/23/19 07:39 | 220 12th St, San Francisco, CA 94016 | 9 | 700.00 | San Francisco | 7 | iPhone,iPhone |
186848 | 259356 | 34in Ultrawide Monitor | 1.0 | 379.99 | 09/19/19 17:30 | 511 Forest St, San Francisco, CA 94016 | 9 | 379.99 | San Francisco | 17 | 34in Ultrawide Monitor,34in Ultrawide Monitor |
186849 | 259357 | USB-C Charging Cable | 1.0 | 11.95 | 09/30/19 00:18 | 250 Meadow St, San Francisco, CA 94016 | 9 | 11.95 | San Francisco | 0 | USB-C Charging Cable,USB-C Charging Cable |
178437 rows × 11 columns
df2['Grouped'].value_counts()[0:5]
Lightning Charging Cable,Lightning Charging Cable 19831 AAA Batteries (4-pack),AAA Batteries (4-pack) 19826 AA Batteries (4-pack),AA Batteries (4-pack) 19809 USB-C Charging Cable,USB-C Charging Cable 19792 Wired Headphones,Wired Headphones 17208 Name: Grouped, dtype: int64
df2['Grouped'].value_counts()[0:5].plot.pie()
<AxesSubplot:ylabel='Grouped'>